Exercises

For the exercises it is best to install a fresh copy of stars, using

install.packages("remotes") # if not already installed
remotes::install_github("r-spatial/stars")

and to install starsdata, which is about 1 Gb in size (don't forget to remove afterwards):

install.packages("starsdata", repos = "http://gis-bigdata.uni-muenster.de/pebesma", type = "source")

What are data cubes?

  • what are data cubes? examples?
  • what defines data cubes?
  • are datacubes three-dimensional?
  • are tables (e.g. data.frames) data cubes?
  • is a raster dataset (e.g. a RasterLayer, or single-band GeoTIFF) a data cube?
  • what defines a dimension of a data cube?
library(stars)
## Loading required package: abind
## Loading required package: sf
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 7.0.0

Vector data cube: panel data

Panel data are time series data collected for a set of subjects (persons, companies, countries etc). That makes them (i) multidimensional, and (ii) spatiotemporal, since the subjects are typically spatial entities (though they might move).

library(ggplot2)
data(Produc, package = "plm")
head(Produc)
##     state year region     pcap     hwy   water    util       pc   gsp    emp
## 1 ALABAMA 1970      6 15032.67 7325.80 1655.68 6051.20 35793.80 28418 1010.5
## 2 ALABAMA 1971      6 15501.94 7525.94 1721.02 6254.98 37299.91 29375 1021.9
## 3 ALABAMA 1972      6 15972.41 7765.42 1764.75 6442.23 38670.30 31303 1072.3
## 4 ALABAMA 1973      6 16406.26 7907.66 1742.41 6756.19 40084.01 33430 1135.5
## 5 ALABAMA 1974      6 16762.67 8025.52 1734.85 7002.29 42057.31 33749 1169.8
## 6 ALABAMA 1975      6 17316.26 8158.23 1752.27 7405.76 43971.71 33604 1155.4
##   unemp
## 1   4.7
## 2   5.2
## 3   4.7
## 4   3.9
## 5   5.5
## 6   7.7
ggplot(Produc) + geom_raster(aes(y = state, x = year, fill = pcap))

this plot shows the raw pcap (public capital stock) values, which mainly demonstrates that large states are large. Normalizing them by dividing through the temporal means, we see more structure:

(s = st_as_stars(Produc, y_decreasing = FALSE))
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region           pcap              hwy            water        
##  Min.   :1.000   Min.   :  2627   Min.   : 1827   Min.   :  228.5  
##  1st Qu.:3.000   1st Qu.:  7097   1st Qu.: 3858   1st Qu.:  764.5  
##  Median :5.000   Median : 17572   Median : 7556   Median : 2266.5  
##  Mean   :4.958   Mean   : 25037   Mean   :10218   Mean   : 3618.8  
##  3rd Qu.:7.000   3rd Qu.: 27692   3rd Qu.:11267   3rd Qu.: 4318.7  
##  Max.   :9.000   Max.   :140217   Max.   :47699   Max.   :24592.3  
##      util               pc               gsp              emp         
##  Min.   :  538.5   Min.   :  4053   Min.   :  4354   Min.   :  108.3  
##  1st Qu.: 2488.3   1st Qu.: 21651   1st Qu.: 16502   1st Qu.:  475.0  
##  Median : 7008.8   Median : 40671   Median : 39987   Median : 1164.8  
##  Mean   :11199.5   Mean   : 58188   Mean   : 61014   Mean   : 1747.1  
##  3rd Qu.:11598.5   3rd Qu.: 64796   3rd Qu.: 68126   3rd Qu.: 2114.1  
##  Max.   :80728.1   Max.   :375342   Max.   :464550   Max.   :11258.0  
##      unemp       
##  Min.   : 2.800  
##  1st Qu.: 5.000  
##  Median : 6.200  
##  Mean   : 6.602  
##  3rd Qu.: 7.900  
##  Max.   :18.000  
## dimension(s):
##       from to offset delta refsys point              values x/y
## state    1 48     NA    NA     NA    NA ALABAMA,...,WYOMING [x]
## year     1 17   1986    -1     NA    NA                NULL [y]
s = st_apply(s, 1, function(x) x/mean(x)) %>%
  st_set_dimensions(names = c("year", "state"))
s
## stars object with 2 dimensions and 9 attributes
## attribute(s):
##     region       pcap              hwy             water       
##  Min.   :1   Min.   :0.6705   Min.   :0.6181   Min.   :0.5294  
##  1st Qu.:1   1st Qu.:0.9327   1st Qu.:0.9646   1st Qu.:0.8626  
##  Median :1   Median :1.0146   Median :1.0088   Median :0.9968  
##  Mean   :1   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1   3rd Qu.:1.0644   3rd Qu.:1.0413   3rd Qu.:1.1286  
##  Max.   :1   Max.   :1.4160   Max.   :1.1928   Max.   :1.6239  
##      util              pc               gsp              emp        
##  Min.   :0.5385   Min.   :0.6231   Min.   :0.6215   Min.   :0.6055  
##  1st Qu.:0.9236   1st Qu.:0.8718   1st Qu.:0.8921   1st Qu.:0.9065  
##  Median :1.0178   Median :1.0041   Median :1.0012   Median :1.0155  
##  Mean   :1.0000   Mean   :1.0000   Mean   :1.0000   Mean   :1.0000  
##  3rd Qu.:1.0727   3rd Qu.:1.1204   3rd Qu.:1.0939   3rd Qu.:1.0920  
##  Max.   :1.7289   Max.   :1.4786   Max.   :1.6435   Max.   :1.4798  
##      unemp       
##  Min.   :0.4340  
##  1st Qu.:0.7989  
##  Median :0.9633  
##  Mean   :1.0000  
##  3rd Qu.:1.1732  
##  Max.   :1.9416  
## dimension(s):
##       from to offset delta refsys point              values
## year     1 17     NA    NA     NA    NA                NULL
## state    1 48     NA    NA     NA    NA ALABAMA,...,WYOMING
pr = as.data.frame(s)
head(pr)
##   year   state region     pcap      hwy    water     util       pc      gsp
## 1    1 ALABAMA      1 1.099395 1.061091 1.166170 1.123685 1.220994 1.269043
## 2    2 ALABAMA      1 1.083229 1.050617 1.137907 1.104474 1.202354 1.228148
## 3    3 ALABAMA      1 1.073425 1.042153 1.128882 1.093014 1.177764 1.182770
## 4    4 ALABAMA      1 1.065874 1.036919 1.126735 1.081551 1.166810 1.107454
## 5    5 ALABAMA      1 1.065665 1.040853 1.119124 1.078759 1.149528 1.057200
## 6    6 ALABAMA      1 1.065680 1.038889 1.120675 1.080527 1.119022 1.077569
##        emp    unemp
## 1 1.164437 1.217836
## 2 1.135630 1.105994
## 3 1.104277 1.366959
## 4 1.057407 1.739766
## 5 1.044436 1.739766
## 6 1.072367 1.366959
ggplot(pr) + geom_raster(aes(y = state, x = year, fill = pcap))

Vector data cube: North Carolina SIDS

The North Carolina SIDS (sudden infant death syndrome) data set, introduced here, is an epidemiological data set containing population (births, BIR), disease cases (SID), and non-white births (NWBIR) for two periods, indicated by the start years 1974 and 1979. The population and time information is spread over columns in a data.frame:

nc = read_sf(system.file("gpkg/nc.gpkg", package="sf"))
nc.df = st_set_geometry(nc, NULL) # m is a regular, non-spatial data.frame
head(nc.df)
## # A tibble: 6 x 14
##    AREA PERIMETER CNTY_ CNTY_ID NAME  FIPS  FIPSNO CRESS_ID BIR74 SID74 NWBIR74
##   <dbl>     <dbl> <dbl>   <dbl> <chr> <chr>  <dbl>    <int> <dbl> <dbl>   <dbl>
## 1 0.114      1.44  1825    1825 Ashe  37009  37009        5  1091     1      10
## 2 0.061      1.23  1827    1827 Alle… 37005  37005        3   487     0      10
## 3 0.143      1.63  1828    1828 Surry 37171  37171       86  3188     5     208
## 4 0.07       2.97  1831    1831 Curr… 37053  37053       27   508     1     123
## 5 0.153      2.21  1832    1832 Nort… 37131  37131       66  1421     9    1066
## 6 0.097      1.67  1833    1833 Hert… 37091  37091       46  1452     7     954
## # … with 3 more variables: BIR79 <dbl>, SID79 <dbl>, NWBIR79 <dbl>

We now mold this table into a 100 (counties) x 3 (categories) x 2 (years) array:

mat = as.matrix(nc.df[c("BIR74", "SID74", "NWBIR74", "BIR79", "SID79", "NWBIR79")])
dim(mat) = c(county = 100, var = 3, year = 2) # make it a 3-dimensional array
# set dimension values to the array:
dimnames(mat) = list(county = nc$NAME, var = c("BIR", "SID", "NWBIR"), year = c(1974, 1979))
# convert array into a stars object
(nc.st = st_as_stars(pop = mat))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##       pop       
##  Min.   :    0  
##  1st Qu.:    8  
##  Median :  538  
##  Mean   : 1657  
##  3rd Qu.: 1784  
##  Max.   :30757  
## dimension(s):
##        from  to offset delta refsys point              values
## county    1 100     NA    NA     NA    NA  Ashe,...,Brunswick
## var       1   3     NA    NA     NA    NA BIR  , SID  , NWBIR
## year      1   2     NA    NA     NA    NA          1974, 1979

after which we can replace the county names with the county geometries:

(nc.geom <- st_set_dimensions(nc.st, 1, st_geometry(nc)))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##       pop       
##  Min.   :    0  
##  1st Qu.:    8  
##  Median :  538  
##  Mean   : 1657  
##  3rd Qu.: 1784  
##  Max.   :30757  
## dimension(s):
##      from  to offset delta refsys point
## sfc     1 100     NA    NA  NAD27 FALSE
## var     1   3     NA    NA     NA    NA
## year    1   2     NA    NA     NA    NA
##                                                                 values
## sfc  MULTIPOLYGON (((-81.47276 3...,...,MULTIPOLYGON (((-78.65572 3...
## var                                                BIR  , SID  , NWBIR
## year                                                        1974, 1979

Note that we have now two fields filled for the sfc (simple feature geometry) dimension: refsys and points. What do they mean?

We can compute and plot the sums over the years for each county (1) and variable (2):

plot(st_apply(nc.geom, c(1,2), sum), key.pos = 4) # sum population over year

In order to meaningfully compare disease cases, we standardise incidence rates (SIR), by

\[\mbox{SIR}_i=\frac{c_i/p_i}{\sum_i c_i / \sum_i p_i}\]

with \(c_i\) the incidences and \(p_i\) the corresponding population of county \(i\). For SIR, the value one indicates equality to the mean incidence rate. We first compute the global incidence \(m\):

# split out BIR, SID, NWBIR over attributes:
split(nc.geom, 2) 
## stars object with 2 dimensions and 3 attributes
## attribute(s):
##       BIR             SID             NWBIR      
##  Min.   :  248   Min.   : 0.000   Min.   :    1  
##  1st Qu.: 1177   1st Qu.: 2.000   1st Qu.:  206  
##  Median : 2265   Median : 5.000   Median :  742  
##  Mean   : 3762   Mean   : 7.515   Mean   : 1202  
##  3rd Qu.: 4451   3rd Qu.: 9.000   3rd Qu.: 1316  
##  Max.   :30757   Max.   :57.000   Max.   :11631  
## dimension(s):
##      from  to offset delta refsys point
## sfc     1 100     NA    NA  NAD27 FALSE
## year    1   2     NA    NA     NA    NA
##                                                                 values
## sfc  MULTIPOLYGON (((-81.47276 3...,...,MULTIPOLYGON (((-78.65572 3...
## year                                                        1974, 1979
# sum by category:
(nc.sum = sapply(split(nc.geom, 2), sum)) 
##    BIR    SID  NWBIR 
## 752354   1503 240362
# denominator: mean incidence ratio, averaged over county & years:
(IR = nc.sum[2]/nc.sum[1]) 
##        SID 
## 0.00199773
# standardise each year/counte value by dividing over IR:
nc.SIR = st_apply(nc.geom, c(1,3), function(x) (x[2]/x[1])/IR)
plot(nc.SIR, breaks = c(0,.25,.5,.75,.9,1.1,1.5,2.5,3.5,5), 
  pal = rev(RColorBrewer::brewer.pal(9, "RdBu")))

#library(mapview)
#mapview(breweries)

Vector data cube with two space dimensions: OD matrices

See section 4.3.2 here.

Vector data cubes: why?

Why don't we work on regular data.frame or tbl_df tables?

  • with arrays, it is immediately clear where the missing values are; with tables the records with missing values may be missing (and not NA filled)
  • with tables, you may have duplicate records; no such thing with arrays
  • with tables, order of records is not given a priori; with arrays lookup is much faster, as each dimensions gives an index
  • with arrays you can directly operate over a dimension, or reduce it (e.g. taking the time mean); with tables this is much harder.
  • vector data cubes arise naturally when sampling raster data cubes (up next), or aggregating raster data cubes over polygons

Raster data cubes: raster layer

(g = read_stars(system.file("external/test.grd", package = "raster")))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##    test.grd      
##  Min.   : 138.7  
##  1st Qu.: 294.0  
##  Median : 371.9  
##  Mean   : 425.6  
##  3rd Qu.: 501.0  
##  Max.   :1736.1  
##  NA's   :6022    
## dimension(s):
##   from  to offset delta                       refsys point values x/y
## x    1  80 178400    40 +proj=sterea +lat_0=52.15...    NA   NULL [x]
## y    1 115 334000   -40 +proj=sterea +lat_0=52.15...    NA   NULL [y]
plot(g, col = viridis::viridis(11), breaks = "equal")

Note that:

  • raster files are data cubes: the attribute has values for all combinations of a set of x and y values
  • where vector data cubes have x/y space features in a single dimension, raster data cubes spread x and y over two (x-y raster cells) or three separate (x-y-z voxels) dimensions.
  • this is a regular grid: offset and delta are filled, and geographic coordinates can be computed, e.g. for the x dimension, from 1-based cell index \(i\) by \(x = \mbox{offset} + (i-1) * \mbox{delta}\)
  • index 1 gives the edge, and 1.5 the center of the first grid cell.
  • delta for y is typically negative: going south row indexes increase while y coordinates decrease
  • the dimensions have a refsys (here: a deprecated Proj4 string), allowing a match to other world coordinates (e.g. plotting with leaflet/mapview)
  • what you see are the non-NA cells, the NA cells are not drawn, but they are there; we can show them by converting to features, drawing cell borders grey:
(g.sf = st_as_sf(g, na.rm = FALSE))
## Simple feature collection with 9200 features and 1 field
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 178400 ymin: 329400 xmax: 181600 ymax: 334000
## CRS:            unknown
## First 10 features:
##    test.grd                       geometry
## 1        NA POLYGON ((178400 334000, 17...
## 2        NA POLYGON ((178440 334000, 17...
## 3        NA POLYGON ((178480 334000, 17...
## 4        NA POLYGON ((178520 334000, 17...
## 5        NA POLYGON ((178560 334000, 17...
## 6        NA POLYGON ((178600 334000, 17...
## 7        NA POLYGON ((178640 334000, 17...
## 8        NA POLYGON ((178680 334000, 17...
## 9        NA POLYGON ((178720 334000, 17...
## 10       NA POLYGON ((178760 334000, 17...
plot(g.sf, border = 'grey', pal = viridis::viridis(9), nbreaks = 10)

Raster data cubes: multiple layers

A lot of raster information comes as multi-layer, the simplest being rgb images:

(r = read_stars(system.file("pictures/Rlogo.jpg", package = "rgdal"))) # the old one
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##    Rlogo.jpg    
##  Min.   :  8.0  
##  1st Qu.:181.0  
##  Median :255.0  
##  Mean   :207.9  
##  3rd Qu.:255.0  
##  Max.   :255.0  
## dimension(s):
##      from  to offset delta refsys point values x/y
## x       1 200      0     1     NA    NA   NULL [x]
## y       1 175    175    -1     NA    NA   NULL [y]
## band    1   3     NA    NA     NA    NA   NULL
plot(r, breaks = "equal")

Obviously, such data can be plotted as colors; we will first convert the rgb values to R color values:

(r.rgb = st_rgb(r))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##     rgb3           
##  Length:35000      
##  Class :character  
##  Mode  :character  
## dimension(s):
##   from  to offset delta refsys point values x/y
## x    1 200      0     1     NA    NA   NULL [x]
## y    1 175    175    -1     NA    NA   NULL [y]
r.rgb[[1]][1:3,1]
## [1] "#FFFFFF" "#FFFFFF" "#FFFFFF"

before plotting it:

plot(r.rgb)

Multi-band data is much more common and goes beyond rgb; an exerpt with the 30m bands of a Landsat-7 image is found here:

L7file = system.file("tif/L7_ETMs.tif", package = "stars")
(L7 = read_stars(L7file))
## stars object with 3 dimensions and 1 attribute
## attribute(s):
##   L7_ETMs.tif    
##  Min.   :  1.00  
##  1st Qu.: 54.00  
##  Median : 69.00  
##  Mean   : 68.91  
##  3rd Qu.: 86.00  
##  Max.   :255.00  
## dimension(s):
##      from  to  offset delta                       refsys point values x/y
## x       1 349  288776  28.5 UTM Zone 25, Southern Hem... FALSE   NULL [x]
## y       1 352 9120761 -28.5 UTM Zone 25, Southern Hem... FALSE   NULL [y]
## band    1   6      NA    NA                           NA    NA   NULL

Plotting this uses histogram stretching over all bands. I think of histogram stretching as HDR in monochrome attribute space: each grey tone covers the same area, color breaks are quantiles of the map layer:

plot(L7)

We can also do the stretching over each band individually, meaning there is no common key:

plot(L7, join_zlim = FALSE)

From these bands we can also make color or false color composites:

par(mfrow = c(1, 2))
plot(L7, rgb = c(3,2,1), reset = FALSE, main = "RGB")    # rgb
plot(L7, rgb = c(4,3,2), main = "False color (NIR-R-G)") # false color

Transforming and warping rasters

Suppose we create a 1-degree grid over parts of Europe:

bb = st_bbox(c(xmin = -10, xmax = 20, ymin = 40, ymax = 60), crs = 4326)
(x = st_as_stars(bb, dx = 1, dy = 1))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##     values  
##  Min.   :0  
##  1st Qu.:0  
##  Median :0  
##  Mean   :0  
##  3rd Qu.:0  
##  Max.   :0  
## dimension(s):
##   from to offset delta refsys point values x/y
## x    1 30    -10     1 WGS 84    NA   NULL [x]
## y    1 20     60    -1 WGS 84    NA   NULL [y]

We can plot the grid outline e.g. by:

sf_use_s2(FALSE)
library(rnaturalearth)
ne = ne_countries(returnclass = "sf", continent = 'europe')
ne$pop_dens = units::set_units(ne$pop_est / st_area(ne), 1/(km^2))
plot(ne["pop_dens"], reset = FALSE, extent = bb)
pop = st_rasterize(ne["pop_dens"], x)
plot(st_as_sf(pop, na.rm = FALSE), add = TRUE, border = 'grey')

We can transform this grid e.g. to the ETRS89 / LAEA projection:

(pop.3035 = st_transform(pop, 3035))
## stars object with 2 dimensions and 1 attribute
## attribute(s):
##  pop_dens [1/km^2]
##  Min.   :  0.00   
##  1st Qu.:  0.00   
##  Median : 78.03   
##  Mean   : 80.40   
##  3rd Qu.:123.98   
##  Max.   :417.65   
## dimension(s):
##   from to offset delta                       refsys point
## x    1 30     NA    NA ETRS89-extended / LAEA Eu... FALSE
## y    1 20     NA    NA ETRS89-extended / LAEA Eu... FALSE
##                        values x/y
## x [30x20] 2680703,...,5127800 [x]
## y [30x20] 1933849,...,4195484 [y]
## curvilinear grid
ne.3035 = st_transform(ne, 3035)
plot(pop.3035, border = 'grey', reset = FALSE)
plot(st_geometry(ne.3035), add = TRUE)

Raster data cubes: time and multiple attributes

Operations on data cubes

From raster to vector, vector to raster